In [1]:
#This project aims to analyse a kaggle dataset containing professional football players across the top 5 European leagues

#The data contains the player names, position, team, age and their stats for the season

#My goal is to clean the dataset, perform feature engineering,
# create visuals to showcase the best performing players and teams, the age and nationality differences,
# and finally to implement machine learning algorithms based on my work, with the aim to create an accurate predictive model
In [2]:
#Import file

import pandas as pd
import numpy as np

inpath = "C:/Users/ryans/Downloads/"
filename = "2022-2023 Football Player Stats.csv"
players_df = pd.read_csv(inpath + filename, delimiter=';', header=0, index_col=None, encoding='cp1252')
players_df
Out[2]:
Rk Player Nation Pos Squad Comp Age Born MP Starts ... Off Crs TklW PKwon PKcon OG Recov AerWon AerLost AerWon%
0 1 Brenden Aaronson USA MFFW Leeds United Premier League 22 2000 20 19 ... 0.17 2.54 0.51 0.0 0.0 0.00 4.86 0.34 1.19 22.2
1 2 Yunis Abdelhamid MAR DF Reims Ligue 1 35 1987 22 22 ... 0.05 0.18 1.59 0.0 0.0 0.00 6.64 2.18 1.23 64.0
2 3 Himad Abdelli FRA MFFW Angers Ligue 1 23 1999 14 8 ... 0.00 1.05 1.40 0.0 0.0 0.00 8.14 0.93 1.05 47.1
3 4 Salis Abdul Samed GHA MF Lens Ligue 1 22 2000 20 20 ... 0.00 0.35 0.80 0.0 0.0 0.05 6.60 0.50 0.50 50.0
4 5 Laurent Abergel FRA MF Lorient Ligue 1 30 1993 15 15 ... 0.00 0.23 2.02 0.0 0.0 0.00 6.51 0.31 0.39 44.4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2684 2685 Szymon ?urkowski POL MF Fiorentina Serie A 25 1997 2 0 ... 0.00 2.50 0.00 0.0 0.0 0.00 5.00 2.50 2.50 50.0
2685 2686 Szymon ?urkowski POL MF Spezia Serie A 25 1997 1 0 ... 0.00 0.00 0.00 0.0 0.0 0.00 30.00 0.00 0.00 0.0
2686 2687 Martin Ødegaard NOR MF Arsenal Premier League 24 1998 20 20 ... 0.05 2.51 0.70 0.0 0.0 0.00 5.19 0.48 0.70 40.9
2687 2688 Milan ?uri? BIH FW Hellas Verona Serie A 32 1990 16 7 ... 0.38 0.00 0.13 0.0 0.0 0.00 2.05 14.50 3.59 80.1
2688 2689 Filip ?uri?i? SRB MFFW Sampdoria Serie A 31 1992 20 17 ... 0.00 1.56 0.88 0.0 0.0 0.00 4.97 0.54 0.88 38.1

2689 rows × 124 columns

In [3]:
#Data cleaning
In [4]:
#isna shows how many values are na

players_df.isna().sum(axis=0)
Out[4]:
Rk         0
Player     0
Nation     1
Pos        0
Squad      0
          ..
OG         0
Recov      0
AerWon     0
AerLost    0
AerWon%    0
Length: 124, dtype: int64
In [5]:
#Display unique player names, check for any anomalies/outliers
print("Unique names:", players_df['Player'].unique())
print('\n')

# Ensure all positions are valid
print("Unique positions:", players_df['Pos'].unique())
print('\n')

#How many competitions/leagues are present in the dataset
leagues = players_df['Comp'].unique()
print("Leagues:", leagues)
Unique names: ['Brenden Aaronson' 'Yunis Abdelhamid' 'Himad Abdelli' ...
 'Martin Ødegaard' 'Milan ?uri?' 'Filip ?uri?i?']


Unique positions: ['MFFW' 'DF' 'MF' 'FWMF' 'FW' 'DFFW' 'MFDF' 'GK' 'DFMF' 'FWDF']


Leagues: ['Premier League' 'Ligue 1' 'Serie A' 'La Liga' 'Bundesliga']
In [6]:
#Remove players whose names contain '?' as the data is dirty and not accurate, notice 105 rows are removed
players_df = players_df[~players_df['Player'].str.contains('\?')]
players_df
Out[6]:
Rk Player Nation Pos Squad Comp Age Born MP Starts ... Off Crs TklW PKwon PKcon OG Recov AerWon AerLost AerWon%
0 1 Brenden Aaronson USA MFFW Leeds United Premier League 22 2000 20 19 ... 0.17 2.54 0.51 0.0 0.0 0.00 4.86 0.34 1.19 22.2
1 2 Yunis Abdelhamid MAR DF Reims Ligue 1 35 1987 22 22 ... 0.05 0.18 1.59 0.0 0.0 0.00 6.64 2.18 1.23 64.0
2 3 Himad Abdelli FRA MFFW Angers Ligue 1 23 1999 14 8 ... 0.00 1.05 1.40 0.0 0.0 0.00 8.14 0.93 1.05 47.1
3 4 Salis Abdul Samed GHA MF Lens Ligue 1 22 2000 20 20 ... 0.00 0.35 0.80 0.0 0.0 0.05 6.60 0.50 0.50 50.0
4 5 Laurent Abergel FRA MF Lorient Ligue 1 30 1993 15 15 ... 0.00 0.23 2.02 0.0 0.0 0.00 6.51 0.31 0.39 44.4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2680 2681 Kurt Zouma FRA DF West Ham Premier League 28 1994 15 15 ... 0.00 0.00 0.14 0.0 0.0 0.00 3.94 2.61 0.35 88.1
2681 2682 Petar Zovko BIH GK Spezia Serie A 20 2002 1 0 ... 0.00 0.00 0.00 0.0 0.0 0.00 2.50 0.00 0.00 0.0
2682 2683 Igor Zubeldia ESP DF Real Sociedad La Liga 25 1997 16 14 ... 0.00 0.21 0.86 0.0 0.0 0.00 4.93 2.00 1.57 56.0
2683 2684 Martín Zubimendi ESP MF Real Sociedad La Liga 24 1999 19 18 ... 0.06 0.11 1.07 0.0 0.0 0.00 6.18 2.19 0.90 70.9
2686 2687 Martin Ødegaard NOR MF Arsenal Premier League 24 1998 20 20 ... 0.05 2.51 0.70 0.0 0.0 0.00 5.19 0.48 0.70 40.9

2584 rows × 124 columns

In [7]:
#Analyse Age values for outliers
#Interquartile Range method to find outliers

Q1 =players_df['Age'].quantile(0.25)
Q3 = players_df['Age'].quantile(0.75)
IQR = Q3 - Q1


#Define an outlier as being beyond 1.5 times the IQR from the quartiles
IQR = players_df['Age'].quantile(0.75) - players_df['Age'].quantile(0.25)
lower_bound = players_df['Age'].quantile(0.25) - 1.5 * IQR
upper_bound = players_df['Age'].quantile(0.75) + 1.5 * IQR

print('lower bound is:', lower_bound)
print('higher bound is:', upper_bound)
lower bound is: 14.0
higher bound is: 38.0
In [8]:
#Remove players that fall outside the age range of 14-38
#Notice 10 rows are removed

filtered_players_df = players_df[(players_df['Age'] >= lower_bound) & (players_df['Age'] <= upper_bound)]
filtered_players_df
Out[8]:
Rk Player Nation Pos Squad Comp Age Born MP Starts ... Off Crs TklW PKwon PKcon OG Recov AerWon AerLost AerWon%
0 1 Brenden Aaronson USA MFFW Leeds United Premier League 22 2000 20 19 ... 0.17 2.54 0.51 0.0 0.0 0.00 4.86 0.34 1.19 22.2
1 2 Yunis Abdelhamid MAR DF Reims Ligue 1 35 1987 22 22 ... 0.05 0.18 1.59 0.0 0.0 0.00 6.64 2.18 1.23 64.0
2 3 Himad Abdelli FRA MFFW Angers Ligue 1 23 1999 14 8 ... 0.00 1.05 1.40 0.0 0.0 0.00 8.14 0.93 1.05 47.1
3 4 Salis Abdul Samed GHA MF Lens Ligue 1 22 2000 20 20 ... 0.00 0.35 0.80 0.0 0.0 0.05 6.60 0.50 0.50 50.0
4 5 Laurent Abergel FRA MF Lorient Ligue 1 30 1993 15 15 ... 0.00 0.23 2.02 0.0 0.0 0.00 6.51 0.31 0.39 44.4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2680 2681 Kurt Zouma FRA DF West Ham Premier League 28 1994 15 15 ... 0.00 0.00 0.14 0.0 0.0 0.00 3.94 2.61 0.35 88.1
2681 2682 Petar Zovko BIH GK Spezia Serie A 20 2002 1 0 ... 0.00 0.00 0.00 0.0 0.0 0.00 2.50 0.00 0.00 0.0
2682 2683 Igor Zubeldia ESP DF Real Sociedad La Liga 25 1997 16 14 ... 0.00 0.21 0.86 0.0 0.0 0.00 4.93 2.00 1.57 56.0
2683 2684 Martín Zubimendi ESP MF Real Sociedad La Liga 24 1999 19 18 ... 0.06 0.11 1.07 0.0 0.0 0.00 6.18 2.19 0.90 70.9
2686 2687 Martin Ødegaard NOR MF Arsenal Premier League 24 1998 20 20 ... 0.05 2.51 0.70 0.0 0.0 0.00 5.19 0.48 0.70 40.9

2574 rows × 124 columns

In [9]:
#Filter the DataFrame to include only rows where MP (matches played) is 5 or greater
#This removes rotational players and those who have maybe struggled with injury

filtered_players_df = filtered_players_df[filtered_players_df['MP'] >= 5]
filtered_players_df
Out[9]:
Rk Player Nation Pos Squad Comp Age Born MP Starts ... Off Crs TklW PKwon PKcon OG Recov AerWon AerLost AerWon%
0 1 Brenden Aaronson USA MFFW Leeds United Premier League 22 2000 20 19 ... 0.17 2.54 0.51 0.0 0.0 0.00 4.86 0.34 1.19 22.2
1 2 Yunis Abdelhamid MAR DF Reims Ligue 1 35 1987 22 22 ... 0.05 0.18 1.59 0.0 0.0 0.00 6.64 2.18 1.23 64.0
2 3 Himad Abdelli FRA MFFW Angers Ligue 1 23 1999 14 8 ... 0.00 1.05 1.40 0.0 0.0 0.00 8.14 0.93 1.05 47.1
3 4 Salis Abdul Samed GHA MF Lens Ligue 1 22 2000 20 20 ... 0.00 0.35 0.80 0.0 0.0 0.05 6.60 0.50 0.50 50.0
4 5 Laurent Abergel FRA MF Lorient Ligue 1 30 1993 15 15 ... 0.00 0.23 2.02 0.0 0.0 0.00 6.51 0.31 0.39 44.4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2678 2679 Nadir Zortea ITA DFMF Atalanta Serie A 23 1999 9 1 ... 0.00 4.00 2.00 0.0 0.0 0.00 5.60 2.00 0.40 83.3
2680 2681 Kurt Zouma FRA DF West Ham Premier League 28 1994 15 15 ... 0.00 0.00 0.14 0.0 0.0 0.00 3.94 2.61 0.35 88.1
2682 2683 Igor Zubeldia ESP DF Real Sociedad La Liga 25 1997 16 14 ... 0.00 0.21 0.86 0.0 0.0 0.00 4.93 2.00 1.57 56.0
2683 2684 Martín Zubimendi ESP MF Real Sociedad La Liga 24 1999 19 18 ... 0.06 0.11 1.07 0.0 0.0 0.00 6.18 2.19 0.90 70.9
2686 2687 Martin Ødegaard NOR MF Arsenal Premier League 24 1998 20 20 ... 0.05 2.51 0.70 0.0 0.0 0.00 5.19 0.48 0.70 40.9

1978 rows × 124 columns

In [10]:
#Display the first few rows of the dataset
print("Data Head:")
print('\n')
print(filtered_players_df.head())
print('\n') 

#Show basic information about the dataset
print("Data Info:")
print('\n')
print(filtered_players_df.info())
print('\n')

#Display the shape of the dataset (rows, columns)
print("Dataframe shape:", filtered_players_df.shape)
Data Head:


   Rk             Player Nation   Pos         Squad            Comp  Age  \
0   1   Brenden Aaronson    USA  MFFW  Leeds United  Premier League   22   
1   2   Yunis Abdelhamid    MAR    DF         Reims         Ligue 1   35   
2   3      Himad Abdelli    FRA  MFFW        Angers         Ligue 1   23   
3   4  Salis Abdul Samed    GHA    MF          Lens         Ligue 1   22   
4   5    Laurent Abergel    FRA    MF       Lorient         Ligue 1   30   

   Born  MP  Starts  ...   Off   Crs  TklW  PKwon  PKcon    OG  Recov  AerWon  \
0  2000  20      19  ...  0.17  2.54  0.51    0.0    0.0  0.00   4.86    0.34   
1  1987  22      22  ...  0.05  0.18  1.59    0.0    0.0  0.00   6.64    2.18   
2  1999  14       8  ...  0.00  1.05  1.40    0.0    0.0  0.00   8.14    0.93   
3  2000  20      20  ...  0.00  0.35  0.80    0.0    0.0  0.05   6.60    0.50   
4  1993  15      15  ...  0.00  0.23  2.02    0.0    0.0  0.00   6.51    0.31   

   AerLost  AerWon%  
0     1.19     22.2  
1     1.23     64.0  
2     1.05     47.1  
3     0.50     50.0  
4     0.39     44.4  

[5 rows x 124 columns]


Data Info:


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1978 entries, 0 to 2686
Columns: 124 entries, Rk to AerWon%
dtypes: float64(112), int64(7), object(5)
memory usage: 1.9+ MB
None


Dataframe shape: (1978, 124)
In [11]:
#Basic statistics for numerical features
print(filtered_players_df.describe())
                Rk          Age         Born           MP       Starts  \
count  1978.000000  1978.000000  1978.000000  1978.000000  1978.000000   
mean   1330.520222    26.342770  1995.822042    14.721941    10.746208   
std     774.631741     4.215106     4.224681     4.922211     6.366111   
min       1.000000    16.000000  1984.000000     5.000000     0.000000   
25%     665.250000    23.000000  1993.000000    11.000000     5.000000   
50%    1323.000000    26.000000  1996.000000    15.500000    11.000000   
75%    1993.500000    29.000000  1999.000000    19.000000    16.000000   
max    2687.000000    38.000000  2006.000000    23.000000    23.000000   

               Min          90s        Goals        Shots          SoT  ...  \
count  1978.000000  1978.000000  1978.000000  1978.000000  1978.000000  ...   
mean    960.107179    10.668453     1.311931     1.258726     0.415940  ...   
std     524.718825     5.830213     2.218073     1.048924     0.456647  ...   
min      19.000000     0.200000     0.000000     0.000000     0.000000  ...   
25%     515.000000     5.700000     0.000000     0.430000     0.050000  ...   
50%     945.500000    10.500000     0.000000     0.980000     0.280000  ...   
75%    1392.750000    15.500000     2.000000     1.910000     0.660000  ...   
max    2070.000000    23.000000    25.000000     6.670000     4.000000  ...   

               Off          Crs         TklW        PKwon        PKcon  \
count  1978.000000  1978.000000  1978.000000  1978.000000  1978.000000   
mean      0.201532     1.660253     0.956512     0.011871     0.015677   
std       0.355716     1.939174     0.637352     0.048743     0.050165   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.200000     0.510000     0.000000     0.000000   
50%       0.060000     0.925000     0.900000     0.000000     0.000000   
75%       0.260000     2.520000     1.320000     0.000000     0.000000   
max       5.000000    11.700000     5.000000     0.870000     0.630000   

                OG        Recov       AerWon      AerLost      AerWon%  
count  1978.000000  1978.000000  1978.000000  1978.000000  1978.000000  
mean      0.003281     4.994434     1.296891     1.399393    47.577958  
std       0.018318     1.958077     1.176698     1.158376    21.362866  
min       0.000000     0.000000     0.000000     0.000000     0.000000  
25%       0.000000     3.820000     0.490000     0.710000    33.300000  
50%       0.000000     5.090000     1.010000     1.150000    48.950000  
75%       0.000000     6.250000     1.780000     1.740000    60.000000  
max       0.290000    19.000000    12.500000    10.800000   100.000000  

[8 rows x 119 columns]
In [12]:
#Count missing values in each column and filter to show only those with missing values
missing_values = filtered_players_df.isnull().sum()
missing_values_cols = missing_values[missing_values > 0]

print("Columns with missing values and their count:")
print(missing_values_cols)
print('\n')

#Count missing values in each column
print('Missing values in each column:')
print(filtered_players_df.isnull().sum())
Columns with missing values and their count:
Series([], dtype: int64)


Missing values in each column:
Rk         0
Player     0
Nation     0
Pos        0
Squad      0
          ..
OG         0
Recov      0
AerWon     0
AerLost    0
AerWon%    0
Length: 124, dtype: int64
In [13]:
#Data is now clean, change df name to clean_df
clean_df=filtered_players_df
clean_df
Out[13]:
Rk Player Nation Pos Squad Comp Age Born MP Starts ... Off Crs TklW PKwon PKcon OG Recov AerWon AerLost AerWon%
0 1 Brenden Aaronson USA MFFW Leeds United Premier League 22 2000 20 19 ... 0.17 2.54 0.51 0.0 0.0 0.00 4.86 0.34 1.19 22.2
1 2 Yunis Abdelhamid MAR DF Reims Ligue 1 35 1987 22 22 ... 0.05 0.18 1.59 0.0 0.0 0.00 6.64 2.18 1.23 64.0
2 3 Himad Abdelli FRA MFFW Angers Ligue 1 23 1999 14 8 ... 0.00 1.05 1.40 0.0 0.0 0.00 8.14 0.93 1.05 47.1
3 4 Salis Abdul Samed GHA MF Lens Ligue 1 22 2000 20 20 ... 0.00 0.35 0.80 0.0 0.0 0.05 6.60 0.50 0.50 50.0
4 5 Laurent Abergel FRA MF Lorient Ligue 1 30 1993 15 15 ... 0.00 0.23 2.02 0.0 0.0 0.00 6.51 0.31 0.39 44.4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2678 2679 Nadir Zortea ITA DFMF Atalanta Serie A 23 1999 9 1 ... 0.00 4.00 2.00 0.0 0.0 0.00 5.60 2.00 0.40 83.3
2680 2681 Kurt Zouma FRA DF West Ham Premier League 28 1994 15 15 ... 0.00 0.00 0.14 0.0 0.0 0.00 3.94 2.61 0.35 88.1
2682 2683 Igor Zubeldia ESP DF Real Sociedad La Liga 25 1997 16 14 ... 0.00 0.21 0.86 0.0 0.0 0.00 4.93 2.00 1.57 56.0
2683 2684 Martín Zubimendi ESP MF Real Sociedad La Liga 24 1999 19 18 ... 0.06 0.11 1.07 0.0 0.0 0.00 6.18 2.19 0.90 70.9
2686 2687 Martin Ødegaard NOR MF Arsenal Premier League 24 1998 20 20 ... 0.05 2.51 0.70 0.0 0.0 0.00 5.19 0.48 0.70 40.9

1978 rows × 124 columns

In [14]:
#Feature Engineering
In [15]:
#Create a new feature 'AgeGroup' to categorize players into different age groups

clean_df['AgeGroup'] = pd.cut(clean_df['Age'], bins=[0, 20, 25, 30, 35, 40], labels=['<20', '20-25', '25-30', '30-35', '35-40'])
clean_df
C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\1134519217.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['AgeGroup'] = pd.cut(clean_df['Age'], bins=[0, 20, 25, 30, 35, 40], labels=['<20', '20-25', '25-30', '30-35', '35-40'])
Out[15]:
Rk Player Nation Pos Squad Comp Age Born MP Starts ... Crs TklW PKwon PKcon OG Recov AerWon AerLost AerWon% AgeGroup
0 1 Brenden Aaronson USA MFFW Leeds United Premier League 22 2000 20 19 ... 2.54 0.51 0.0 0.0 0.00 4.86 0.34 1.19 22.2 20-25
1 2 Yunis Abdelhamid MAR DF Reims Ligue 1 35 1987 22 22 ... 0.18 1.59 0.0 0.0 0.00 6.64 2.18 1.23 64.0 30-35
2 3 Himad Abdelli FRA MFFW Angers Ligue 1 23 1999 14 8 ... 1.05 1.40 0.0 0.0 0.00 8.14 0.93 1.05 47.1 20-25
3 4 Salis Abdul Samed GHA MF Lens Ligue 1 22 2000 20 20 ... 0.35 0.80 0.0 0.0 0.05 6.60 0.50 0.50 50.0 20-25
4 5 Laurent Abergel FRA MF Lorient Ligue 1 30 1993 15 15 ... 0.23 2.02 0.0 0.0 0.00 6.51 0.31 0.39 44.4 25-30
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2678 2679 Nadir Zortea ITA DFMF Atalanta Serie A 23 1999 9 1 ... 4.00 2.00 0.0 0.0 0.00 5.60 2.00 0.40 83.3 20-25
2680 2681 Kurt Zouma FRA DF West Ham Premier League 28 1994 15 15 ... 0.00 0.14 0.0 0.0 0.00 3.94 2.61 0.35 88.1 25-30
2682 2683 Igor Zubeldia ESP DF Real Sociedad La Liga 25 1997 16 14 ... 0.21 0.86 0.0 0.0 0.00 4.93 2.00 1.57 56.0 20-25
2683 2684 Martín Zubimendi ESP MF Real Sociedad La Liga 24 1999 19 18 ... 0.11 1.07 0.0 0.0 0.00 6.18 2.19 0.90 70.9 20-25
2686 2687 Martin Ødegaard NOR MF Arsenal Premier League 24 1998 20 20 ... 2.51 0.70 0.0 0.0 0.00 5.19 0.48 0.70 40.9 20-25

1978 rows × 125 columns

In [16]:
#The AgeGroup variable is categorical ordinal data. We use label encoding to convert it to numerical data

#Label encoding (encoding dictionary) is used on categorical data, including ordinal data, and converts the categorical labels into unique numerical representations.
# E.g: 1. low / 2. medium / 3. high. There is an order here.

#Display all the possible values for the AgeGroup variable (use unique)
clean_df['AgeGroup'].unique()
Out[16]:
['20-25', '30-35', '25-30', '<20', '35-40']
Categories (5, object): ['<20' < '20-25' < '25-30' < '30-35' < '35-40']
In [17]:
#Create the encoding dictionary
encoding_dict = {'<20': 1, '20-25': 2, '25-30': 3, '30-35': 4, '35-40': 5}
encoding_dict
Out[17]:
{'<20': 1, '20-25': 2, '25-30': 3, '30-35': 4, '35-40': 5}
In [18]:
#Then apply it to 'Age' using pandas and store it in a ‘Age_encoded_pd’ variable

# Apply the encoding dictionary to 'can_inc_cha_ope_sea'
clean_df['Age_encoded_pd'] = clean_df['AgeGroup'].map(encoding_dict)

# Display the resulting DataFrame
clean_df
C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\2690453717.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['Age_encoded_pd'] = clean_df['AgeGroup'].map(encoding_dict)
Out[18]:
Rk Player Nation Pos Squad Comp Age Born MP Starts ... TklW PKwon PKcon OG Recov AerWon AerLost AerWon% AgeGroup Age_encoded_pd
0 1 Brenden Aaronson USA MFFW Leeds United Premier League 22 2000 20 19 ... 0.51 0.0 0.0 0.00 4.86 0.34 1.19 22.2 20-25 2
1 2 Yunis Abdelhamid MAR DF Reims Ligue 1 35 1987 22 22 ... 1.59 0.0 0.0 0.00 6.64 2.18 1.23 64.0 30-35 4
2 3 Himad Abdelli FRA MFFW Angers Ligue 1 23 1999 14 8 ... 1.40 0.0 0.0 0.00 8.14 0.93 1.05 47.1 20-25 2
3 4 Salis Abdul Samed GHA MF Lens Ligue 1 22 2000 20 20 ... 0.80 0.0 0.0 0.05 6.60 0.50 0.50 50.0 20-25 2
4 5 Laurent Abergel FRA MF Lorient Ligue 1 30 1993 15 15 ... 2.02 0.0 0.0 0.00 6.51 0.31 0.39 44.4 25-30 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2678 2679 Nadir Zortea ITA DFMF Atalanta Serie A 23 1999 9 1 ... 2.00 0.0 0.0 0.00 5.60 2.00 0.40 83.3 20-25 2
2680 2681 Kurt Zouma FRA DF West Ham Premier League 28 1994 15 15 ... 0.14 0.0 0.0 0.00 3.94 2.61 0.35 88.1 25-30 3
2682 2683 Igor Zubeldia ESP DF Real Sociedad La Liga 25 1997 16 14 ... 0.86 0.0 0.0 0.00 4.93 2.00 1.57 56.0 20-25 2
2683 2684 Martín Zubimendi ESP MF Real Sociedad La Liga 24 1999 19 18 ... 1.07 0.0 0.0 0.00 6.18 2.19 0.90 70.9 20-25 2
2686 2687 Martin Ødegaard NOR MF Arsenal Premier League 24 1998 20 20 ... 0.70 0.0 0.0 0.00 5.19 0.48 0.70 40.9 20-25 2

1978 rows × 126 columns

In [19]:
#I will keep the Age variable as I will use it for the visualizations below
In [20]:
#Let's visualize the data
In [21]:
#Visualization 1: Distribution of player ages

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.histplot(clean_df['Age'], kde=True, bins=20)
plt.title('Distribution of Player Ages')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()
In [22]:
# Visualization 2
#Create a bar chart displaying the top 20 teams with the highest goals scored
In [23]:
#Aggregate the goals by team
goal_distribution = clean_df.groupby(['Squad', 'Comp'])['Goals'].sum().reset_index()
goal_distribution
Out[23]:
Squad Comp Goals
0 Ajaccio Ligue 1 16
1 Almería La Liga 22
2 Angers Ligue 1 15
3 Arsenal Premier League 41
4 Aston Villa Premier League 24
... ... ... ...
93 Villarreal La Liga 21
94 Werder Bremen Bundesliga 29
95 West Ham Premier League 18
96 Wolfsburg Bundesliga 35
97 Wolves Premier League 13

98 rows × 3 columns

In [24]:
#Sort the teams by goals scored
goal_distribution = goal_distribution.sort_values('Goals', ascending=False)
goal_distribution
Out[24]:
Squad Comp Goals
11 Bayern Munich Bundesliga 58
68 Paris S-G Ligue 1 54
55 Manchester City Premier League 52
63 Napoli Serie A 48
59 Monaco Ligue 1 48
... ... ... ...
24 Cádiz La Liga 13
97 Wolves Premier League 13
92 Valladolid La Liga 12
36 Hellas Verona Serie A 11
77 Sampdoria Serie A 8

98 rows × 3 columns

In [25]:
#Define a color map for each league
league_colors = {
    'Premier League': 'blue',
    'La Liga': 'red',
    'Bundesliga': 'orange',
    'Serie A': 'green',
    'Ligue 1': 'purple'
}
league_colors
Out[25]:
{'Premier League': 'blue',
 'La Liga': 'red',
 'Bundesliga': 'orange',
 'Serie A': 'green',
 'Ligue 1': 'purple'}
In [26]:
#Assign colors to each team based on their league
team_colors = [league_colors.get(league, 'gray') for league in goal_distribution['Comp']]
In [27]:
#Create custom legend items: coloured patches
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

#Create the bar chart, but only keep the top 20 teams 
plt.figure(figsize=(20, 10))
goal_distribution = goal_distribution.sort_values('Goals', ascending=False).head(20)

bars = plt.bar(goal_distribution['Squad'], goal_distribution['Goals'], color=team_colors, width=0.8)

#Adjust x-axis labels
plt.xticks(rotation=90, ha='right', fontsize=10)

#Adjust the y-axis label
plt.yticks(fontsize=10)

#Create a legend and move it outside the plot area
legend_items = [Patch(facecolor=league_colors[league], label=league) for league in league_colors]
plt.legend(handles=legend_items, title="Leagues", fontsize=20, loc='upper left', bbox_to_anchor=(1, 1))

plt.title('Goal Distribution Across Teams', fontsize=20)
plt.xlabel('Teams', fontsize=20)
plt.ylabel('Total Goals Scored', fontsize=20)
plt.tight_layout(pad=2)
plt.show()
In [28]:
#The goal distribution can be seen as fairly even across the 5 leagues
In [29]:
#Visualization 4 - Horizontal bar chart for top 5 goalscorers in each league

#Top 5 goalscorers in each league 
top_scorers = clean_df.groupby(['Comp', 'Player'])['Goals'].sum().reset_index()

#Sort the scorers within each league
top_scorers = top_scorers.sort_values(['Comp', 'Goals'], ascending=[True, False])

#Let's take the top 5 scorers from each league for the visualization
top_scorers_per_league = top_scorers.groupby('Comp').head(5)

#Plot the horizontal bar chart
plt.figure(figsize=(10, 8))

#Create a color list matching the leagues in top_scorers_per_league
#I want to keep the league colours consistent
legend = [league_colors[league] for league in top_scorers_per_league['Comp'].unique()]

#Now, use the palette in the barplot
sns.barplot(x='Goals', y='Player', hue='Comp', data=top_scorers_per_league, dodge=False, palette=legend)

plt.title('Top 5 Scorers in Each League')
plt.xlabel('Total Goals Scored')
plt.ylabel('Player')
plt.legend(title='League')
plt.tight_layout()
plt.show()
In [30]:
#The top goal scorers seem fairly even across the 5 leagues, with Erling Haaland being the only outlier
In [31]:
#The above graph shows the top goalscorers, but it doesn't factor that some leagues have played more games than others.
#Also, assists are also important metrics to show how effective players are for their team.

#Let's create a column showing goals and assists (goal contributions) per 90 minutes played (the length of a match).
#This shows who is performing well, based on how many minutes they've played overall this season.
#We'll store this in the ['GA_Per90'] variable.
In [32]:
#Visualization 5 - Goal contributions per 90 minutes

#Calculate goals and assists per 90 minutes
clean_df['GoalsPer90'] = (clean_df['Goals'] / clean_df['Min']) * 90
clean_df['AssistsPer90'] = (clean_df['Assists'] / clean_df['Min']) * 90

#Create a new column for goals and assists per 90 minutes
clean_df['GA_Per90'] = clean_df['GoalsPer90'] + clean_df['AssistsPer90']

#Filter players with a reasonable amount of minutes played
min_minutes_threshold = 900
top_players = clean_df[clean_df['Min'] > min_minutes_threshold]

#Sort the players by goals and assists per 90 minutes
top_players = top_players.sort_values('GA_Per90', ascending=False)

#We'll take the top 20 players for the visualization
top_players = top_players.head(20)

#Assign colors to each player based on their league. Notice the league_colors variable from before
player_colors = [league_colors.get(league, 'gray') for league in top_players['Comp']]

#Plot the data
plt.figure(figsize=(14, 8))
sns.barplot(x='GA_Per90', y='Player', data=top_players, palette=player_colors)

#Add a legend for leagues
legend_items = [Patch(facecolor=league_colors[league], label=league) for league in league_colors]
plt.legend(handles=legend_items, title="Leagues")

plt.title('Top Players - Goal Contributions per 90 Minutes')
plt.xlabel('G/A per 90 Minutes')
plt.ylabel('Player')
plt.show()
C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\772913004.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['GoalsPer90'] = (clean_df['Goals'] / clean_df['Min']) * 90
C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\772913004.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['AssistsPer90'] = (clean_df['Assists'] / clean_df['Min']) * 90
C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\772913004.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['GA_Per90'] = clean_df['GoalsPer90'] + clean_df['AssistsPer90']
In [33]:
#Visualization 6 - Box Plot showing Age Distribution within each league

plt.figure(figsize=(14, 7))
sns.boxplot(x='Comp', y='Age', data=clean_df)
plt.title('Age Distribution within the 5 Leagues')
plt.xticks(rotation=45)
plt.xlabel('League')
plt.ylabel('Age')
plt.show()
In [34]:
#The ages are dispersed evenly, with the mean in La Liga being slightly higher
In [35]:
#Visualization 7 - Treemap to show different nationalities across the different leagues
import plotly.express as px

#Group by league and nationality and count the number of players
nationality_counts = clean_df.groupby(['Comp', 'Nation']).size().reset_index(name='Count')

#Plotting the TreeMap
fig = px.treemap(nationality_counts, path=['Comp', 'Nation'], values='Count', color='Nation',
                 title='Nationalities of Players in Each League',
                 color_continuous_scale='Rainbow')
fig.show()
In [36]:
#Visualization 8
#I want to create another treemap showing the unique nationality count within each league

#Group by league and count the unique number of nationalities
nationality_counts = clean_df.groupby('Comp')['Nation'].nunique().reset_index(name='Unique Nationalities')

#Since I am now interested in the count of nationalities rather than players, I adjust the values parameter
fig = px.treemap(nationality_counts, path=['Comp'], values='Unique Nationalities', 
                 title='Count of Nationalities in Each League',
                 color='Unique Nationalities', color_continuous_scale='Rainbow')

fig.show()
In [37]:
#As observed by the visual,
#Serie A has the most diversification of nationalities (63), while Premier league has the least (51).
In [38]:
#Machine Learning

#Let's use Random forest to predict Goals, based on other key variables using the variables present,
#and the variables I have created
In [39]:
#RandomForest algorithm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assuming 'Goals' is the target variable and other relevant features are selected
X = clean_df[['Age_encoded_pd', 'GoalsPer90', 'AssistsPer90',   'Int',  'ToSuc%', 'Carries', 'CarTotDist', ]]  # Example features
y = clean_df['Goals']  # Target variable

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)

# Initialize and train the Random Forest classifier, with 100 trees in the forest
rf = RandomForestClassifier(n_estimators=100) 
rf.fit(X_train, y_train)

# Make predictions and evaluate the model
predictions = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
Accuracy: 0.803030303030303
In [40]:
#Decision trees algorithm

from sklearn.tree import DecisionTreeClassifier


#Set the algorithm parameters and initialize the model
dt_model = DecisionTreeClassifier(random_state=0)

#Fit the model to the training data and make predictions on the test data
y_pred_dt = dt_model.fit(X_train, y_train).predict(X_test)

#Evaluate model performance using accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt:.2f}")
Decision Tree Accuracy: 0.80
In [41]:
#Both models seem to be accurate, at around 80%
In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


#Select features and target
xdf = clean_df[['Age', 'GoalsPer90', 'AssistsPer90', 'Int', 'ToSuc%', 'Carries', 'CarTotDist']]
ydf = clean_df['Goals']  #Target variable, what I want to predict

#Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(xdf, ydf, test_size=0.3, random_state=40)

#Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [43]:
#GradientBoostingRegressor algorithm

#Gradient Boosting is a boosting technique that builds one tree at a time,
#where each new tree helps to correct errors made by previously trained trees.
#The "Gradient" part refers to the use of gradient descent algorithm to minimize the loss when adding new models.

from sklearn.ensemble import GradientBoostingRegressor

#Initialize and train the Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gbr_model.fit(X_train_scaled, y_train)

#Make predictions and evaluate the model
gbr_predictions = gbr_model.predict(X_test_scaled)
print("Gradient Boosting Regressor MSE:", mean_squared_error(y_test, gbr_predictions))
print("Gradient Boosting Regressor R2:", r2_score(y_test, gbr_predictions))
Gradient Boosting Regressor MSE: 0.937768508455521
Gradient Boosting Regressor R2: 0.8100557645425788
In [44]:
#Visualization 9 - Feature Importance

from sklearn.ensemble import RandomForestRegressor

#Initialize the Random Forest model
rf = RandomForestRegressor(n_estimators=100)

#Fit the model
rf.fit(X_train, y_train)

#Get feature importances
importances = rf.feature_importances_

#Print feature importances
for i, feature in enumerate(X_train.columns):
    print(f"{feature}: {importances[i]}")

features = X_train.columns
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), features[indices], rotation=90)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.show()
Age: 0.017294580244490917
GoalsPer90: 0.762103059321091
AssistsPer90: 0.13908120136472957
Int: 0.027288563268093452
ToSuc%: 0.018204194504547452
Carries: 0.01913214858401267
CarTotDist: 0.016896252713034764
In [45]:
#GoalsPer90 and AssistsPer90 are seen as the most important features, which are the features I created